library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
data1 <- read_csv("Stats VVS Climate.csv")
## Rows: 5126 Columns: 46
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Country name
## dbl (45): year, Land area below 5m (% of land area), Agricultural land under...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data1
data2 <- read_csv("Final_Dataset.csv")
## Rows: 5771 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country, SO2_emissions_per_capita, NOx_emissions_per_capita
## dbl (17): Year, X.SO2_change_since_1990, Total_SO2_Emissions, X.CO2_change_s...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data2
data2 <- data2 %>%
mutate(Country = replace(Country, Country == "United States of America", "United States")) %>%
mutate(Country = replace(Country, Country == "United Kingdom of Great Britain and Northern Ireland", "United Kingdom"))
merged_data <- inner_join(data1, data2, by = c("Country name" = "Country", "year" = "Year"))
merged_data
temperatures <- read_csv("temp_country_2013.csv")
## Rows: 150822 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Country
## dbl (3): Month, Year, Temp
## date (1): Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# merged_data %>% group_by(year) %>% select(2:ncol(.)) %>% summarise(is.na(cur_data))
na_fraction <- merged_data %>% mutate(across(3: ncol(.), ~ is.na(merged_data[[cur_column()]]))) %>%
group_by(year) %>% summarise(across(3: ncol(.)-1, mean))
na_fraction_yearwise <- na_fraction%>%
rowwise() %>% mutate(na_per = mean(c_across(3:ncol(.)))) %>%
select(year, na_per, everything())
na_fraction_yearwise
na_fraction_varwise <- na_fraction %>%
summarise(across(3:ncol(.), mean))
na_fraction_varwise
# Fraction of na values in each year
na_fraction_yearwise %>% ggplot(aes(year, na_per)) + geom_col()

# Fraction of na values in each variable
na_fraction_varwise %>% pivot_longer(everything(), names_to = 'var', values_to = "na_per") %>%
ggplot(aes(var, na_per)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=5))

var_names = c("CH4_emissions_per_capita",
"CO2_emissions_per_capita",
#"GDP ($)",
"GHG_emissions_per_capita",
"N2O_emissions_per_capita",
"NOx_emissions_per_capita",
"Population",
"Under-five mortality rate (per 1,000)")
#merged_data %>% select(all_of(var_names)) %>%
# mutate(NOx_emissions_per_capita = parse_double(NOx_emissions_per_capita)) %>%
# #mutate_at(var_names, ~(scale(.) %>% as.vector)) %>%
# pivot_longer(everything(), names_to = 'var', values_to = "values") %>%
# ggplot(aes(x=var, y=values)) + geom_boxplot() +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=10))
vars_with_littleNAs <- merged_data %>% select(all_of(var_names)) %>%
mutate(NOx_emissions_per_capita = parse_double(NOx_emissions_per_capita))
vars_with_littleNAs
mean1 <- vars_with_littleNAs %>% summarise(across(everything(), ~ mean(.x, na.rm=TRUE)))
mean1
std1 <- vars_with_littleNAs %>% summarise(across(everything(), ~ sd(.x, na.rm=TRUE)))
std1
vars_with_littleNAs_norm <- vars_with_littleNAs %>%
mutate(across(everything(), ~ (.x - mean1[[cur_column()]])/std1[[cur_column()]]))
vars_with_littleNAs_norm
# box plot for some variables that have lesser NAs in them
vars_with_littleNAs_norm %>%
pivot_longer(everything(), names_to = 'var', values_to = "values") %>%
filter(!is.na(values)) %>%
ggplot(aes(x=var, y=values)) + geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=10))

# vars_with_littleNAs_norm %>% summarise(min(`Population`, na.rm = TRUE))
# vars_with_littleNAs %>% summarise(min(`Population`, na.rm = TRUE))
# mutate_at(var_names, ~(scale(.) %>% as.vector)) %>% summarise(across(var_names, mean))
vars_with_littleNAs_norm %>%
pivot_longer(everything(), names_to = 'var', values_to = "values") %>%
filter(!is.na(values) & var == "CO2_emissions_per_capita") %>% summarise(mean(values), sd(values))
vars_with_littleNAs %>%
pivot_longer(everything(), names_to = 'var', values_to = "values") %>%
filter(!is.na(values) & var == "CO2_emissions_per_capita") %>% summarise(mean(values), sd(values))
country_names = c("India",
"United States",
"United Kingdom",
"Canada",
"Switzerland")
vars_with_littleNAs_country <- merged_data %>%
select(all_of(c("Country name", "year", var_names))) %>%
filter(`Country name` %in% country_names) %>%
mutate(NOx_emissions_per_capita = parse_double(NOx_emissions_per_capita))
vars_with_littleNAs_country
bar_plot_data <- vars_with_littleNAs_country %>% group_by(`Country name`) %>%
summarise(across(3:ncol(.)-1, ~ mean(.x, na.rm = TRUE)), .groups = "drop") %>%
pivot_longer(2:ncol(.), names_to = "var", values_to = "values") %>%
group_by(var) %>% mutate(values = values/max(values, na.rm = TRUE))
bar_plot_data
# bar plot comparing some variable values among a few countries
bar_plot_data %>%
ggplot(aes(fill=`Country name`, y = values, x = var)) +
geom_bar(position = "dodge", stat = "identity") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=10))
## Warning: Removed 2 rows containing missing values (geom_bar).

temperatures_year_wise <- temperatures %>% group_by(Country, Year) %>%
summarise(Temp = mean(Temp), .groups = "drop") %>%
mutate(Country = replace(Country, Country == "France (Europe)", "France"))
temperatures_year_wise
merged_data_with_temp <- inner_join(merged_data, temperatures_year_wise, by = c("Country name" = "Country", "year" = "Year"))
merged_data_with_temp
unique(merged_data$`Country name`)
## [1] "Afghanistan" "Angola"
## [3] "Albania" "United Arab Emirates"
## [5] "Argentina" "Armenia"
## [7] "Antigua and Barbuda" "Australia"
## [9] "Austria" "Azerbaijan"
## [11] "Burundi" "Belgium"
## [13] "Benin" "Burkina Faso"
## [15] "Bangladesh" "Bulgaria"
## [17] "Bahrain" "Bosnia and Herzegovina"
## [19] "Belarus" "Belize"
## [21] "Brazil" "Barbados"
## [23] "Brunei Darussalam" "Bhutan"
## [25] "Botswana" "Central African Republic"
## [27] "Canada" "Switzerland"
## [29] "Chile" "China"
## [31] "Cameroon" "Cook Islands"
## [33] "Colombia" "Comoros"
## [35] "Costa Rica" "Cuba"
## [37] "Cyprus" "Germany"
## [39] "Djibouti" "Dominica"
## [41] "Denmark" "Dominican Republic"
## [43] "Algeria" "Ecuador"
## [45] "Eritrea" "Spain"
## [47] "Estonia" "Ethiopia"
## [49] "Finland" "Fiji"
## [51] "France" "Gabon"
## [53] "United Kingdom" "Georgia"
## [55] "Ghana" "Guinea"
## [57] "Guinea-Bissau" "Greece"
## [59] "Grenada" "Guatemala"
## [61] "Guyana" "Honduras"
## [63] "Croatia" "Haiti"
## [65] "Hungary" "Indonesia"
## [67] "India" "Ireland"
## [69] "Iraq" "Iceland"
## [71] "Israel" "Italy"
## [73] "Jamaica" "Jordan"
## [75] "Japan" "Kazakhstan"
## [77] "Kenya" "Cambodia"
## [79] "Kiribati" "Kuwait"
## [81] "Lebanon" "Liberia"
## [83] "Liechtenstein" "Sri Lanka"
## [85] "Lesotho" "Lithuania"
## [87] "Luxembourg" "Latvia"
## [89] "Morocco" "Monaco"
## [91] "Madagascar" "Maldives"
## [93] "Mexico" "Marshall Islands"
## [95] "Mali" "Malta"
## [97] "Myanmar" "Montenegro"
## [99] "Mongolia" "Mozambique"
## [101] "Mauritania" "Mauritius"
## [103] "Malawi" "Malaysia"
## [105] "Namibia" "Niger"
## [107] "Nigeria" "Nicaragua"
## [109] "Niue" "Netherlands"
## [111] "Norway" "Nepal"
## [113] "Nauru" "New Zealand"
## [115] "Oman" "Pakistan"
## [117] "Panama" "Peru"
## [119] "Philippines" "Palau"
## [121] "Papua New Guinea" "Poland"
## [123] "Portugal" "Paraguay"
## [125] "Qatar" "Romania"
## [127] "Russian Federation" "Rwanda"
## [129] "Saudi Arabia" "Sudan"
## [131] "Senegal" "Singapore"
## [133] "Solomon Islands" "Sierra Leone"
## [135] "El Salvador" "San Marino"
## [137] "Serbia" "Sao Tome and Principe"
## [139] "Suriname" "Slovenia"
## [141] "Sweden" "Seychelles"
## [143] "Syrian Arab Republic" "Chad"
## [145] "Togo" "Thailand"
## [147] "Tajikistan" "Turkmenistan"
## [149] "Timor-Leste" "Tonga"
## [151] "Trinidad and Tobago" "Tunisia"
## [153] "Turkey" "Tuvalu"
## [155] "Uganda" "Ukraine"
## [157] "Uruguay" "United States"
## [159] "Uzbekistan" "Vanuatu"
## [161] "Samoa" "South Africa"
## [163] "Zambia" "Zimbabwe"
colnames(merged_data_with_temp)
## [1] "Country name"
## [2] "year"
## [3] "Land area below 5m (% of land area)"
## [4] "Agricultural land under irrigation (% of total ag. land)"
## [5] "Cereal yield (kg per hectare)"
## [6] "Foreign direct investment, net inflows (% of GDP)"
## [7] "Access to electricity (% of total population)"
## [8] "Energy use per units of GDP (kg oil eq./$1,000 of 2005 PPP $)"
## [9] "Energy use per capita (kilograms of oil equivalent)"
## [10] "CO2 emissions, total (KtCO2)"
## [11] "CO2 emissions per capita (metric tons)"
## [12] "CO2 emissions per units of GDP (kg/$1,000 of 2005 PPP $)"
## [13] "Other GHG emissions, total (KtCO2e)"
## [14] "Methane (CH4) emissions, total (KtCO2e)"
## [15] "Nitrous oxide (N2O) emissions, total (KtCO2e)"
## [16] "Disaster risk reduction progress score (1-5 scale; 5=best)"
## [17] "GHG net emissions/removals by LUCF (MtCO2e)"
## [18] "Average annual precipitation (1961-1990, mm)"
## [19] "Droughts, floods, extreme temps (% pop. avg. 1990-2009)"
## [20] "Population below 5m (% of total)"
## [21] "Population in urban agglomerations >1million (%)"
## [22] "Annual freshwater withdrawals (% of internal resources)"
## [23] "Nationally terrestrial protected areas (% of total land area)"
## [24] "Ease of doing business (ranking 1-183; 1=best)"
## [25] "Invest. in energy w/ private participation ($)"
## [26] "Invest. in telecoms w/ private participation ($)"
## [27] "Invest. in transport w/ private participation ($)"
## [28] "Invest. in water/sanit. w/ private participation ($)"
## [29] "Public sector mgmt & institutions avg. (1-6 scale; 6=best)"
## [30] "Paved roads (% of total roads)"
## [31] "GDP ($)"
## [32] "GNI per capita (Atlas $)"
## [33] "Ratio of girls to boys in primary & secondary school (%)"
## [34] "Primary completion rate, total (% of relevant age group)"
## [35] "Under-five mortality rate (per 1,000)"
## [36] "Access to improved water source (% of total pop.)"
## [37] "Nurses and midwives (per 1,000 people)"
## [38] "Physicians (per 1,000 people)"
## [39] "Malaria incidence rate (per 100,000 people)"
## [40] "Access to improved sanitation (% of total pop.)"
## [41] "Child malnutrition, underweight (% of under age 5)"
## [42] "Population living below $1.25 a day (% of total)"
## [43] "Population growth (annual %)"
## [44] "Population"
## [45] "Urban population growth (annual %)"
## [46] "Urban population"
## [47] "X.SO2_change_since_1990"
## [48] "SO2_emissions_per_capita"
## [49] "Total_SO2_Emissions"
## [50] "X.CO2_change_since_1990"
## [51] "CO2_emissions_per_capita"
## [52] "Total_CO2_Emissions"
## [53] "X.GHG_change_since_1990"
## [54] "GHG_emissions_per_capita"
## [55] "Total_GHG_Emissions"
## [56] "X.NO2_change_since_1990"
## [57] "N2O_emissions_per_capita"
## [58] "Total_N2O_Emissions"
## [59] "X.Nox_change_since_1990"
## [60] "NOx_emissions_per_capita"
## [61] "Total_NOx_Emissions"
## [62] "X.CH4_change_since_1990"
## [63] "CH4_emissions_per_capita"
## [64] "Total_CH4_Emissions"
## [65] "Temp"
country_names = c("India",
"United States",
"United Kingdom",
"France",
"Switzerland")
# variation if temperature and some other variables with years
merged_data_with_temp %>% #select(3:ncol(.)) %>%
mutate(across(3:ncol(.), as.double)) %>%
filter(`Country name` %in% country_names) %>%
ggplot(aes(year, Temp, color = `Country name`)) + geom_line()

merged_data_with_temp %>% #select(3:ncol(.)) %>%
mutate(across(3:ncol(.), as.double)) %>%
filter(`Country name` %in% country_names) %>%
ggplot(aes(year, `Cereal yield (kg per hectare)`, color = `Country name`)) + geom_line()
## Warning: Removed 12 row(s) containing missing values (geom_path).

merged_data_with_temp %>% #select(3:ncol(.)) %>%
mutate(across(3:ncol(.), as.double)) %>%
filter(`Country name` %in% country_names) %>%
ggplot(aes(year, `CO2 emissions, total (KtCO2)`, color = `Country name`)) + geom_line()
## Warning: Removed 18 row(s) containing missing values (geom_path).

merged_data_with_temp %>% #select(3:ncol(.)) %>%
mutate(across(3:ncol(.), as.double)) %>%
filter(`Country name` %in% country_names) %>%
ggplot(aes(year, `Urban population growth (annual %)`, color = `Country name`)) + geom_line()
## Warning: Removed 6 row(s) containing missing values (geom_path).

merged_data_with_temp %>% #select(3:ncol(.)) %>%
mutate(across(3:ncol(.), as.double)) %>%
filter(`Country name` %in% country_names) %>%
ggplot(aes(year, `GDP ($)`, color = `Country name`)) + geom_line()
## Warning: Removed 6 row(s) containing missing values (geom_path).
